Cleaning TEA Salaries

Setup

Importing the libraries we need.

library(tidyverse)
library(janitor)
library(readxl)

Import all the salary files

salary_files_list <- list.files(
  "data-raw",
  pattern = ".csv",
  full.names = T
)

salary_files_list
 [1] "data-raw/Staff Salary FTE Report_Statewide_Districts_2014-2015.csv"
 [2] "data-raw/Staff Salary FTE Report_Statewide_Districts_2015-2016.csv"
 [3] "data-raw/Staff Salary FTE Report_Statewide_Districts_2016-2017.csv"
 [4] "data-raw/Staff Salary FTE Report_Statewide_Districts_2017-2018.csv"
 [5] "data-raw/Staff Salary FTE Report_Statewide_Districts_2018-2019.csv"
 [6] "data-raw/Staff Salary FTE Report_Statewide_Districts_2019-2020.csv"
 [7] "data-raw/Staff Salary FTE Report_Statewide_Districts_2020-2021.csv"
 [8] "data-raw/Staff Salary FTE Report_Statewide_Districts_2021-2022.csv"
 [9] "data-raw/Staff Salary FTE Report_Statewide_Districts_2022-2023.csv"
[10] "data-raw/Staff Salary FTE Report_Statewide_Districts_2023-2024.csv"
import_files <- function(file_name) {
  df <- file_name |> 
    read_csv(     #read csv first and skip the TEA heading lines
      skip = 5) |>
    clean_names() |> 
    mutate(fte_count = as.numeric(fte_count),
           total_base_pay = as.numeric(total_base_pay),
           average_base_pay = as.numeric(average_base_pay))
}

tea_salaries <- salary_files_list |> set_names(basename) |> 
  map(import_files) |>  #map on our new function
  list_rbind(names_to = "source")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 39350 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 39096 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 39429 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 51617 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 52126 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 52224 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 52500 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 53092 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 53521 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 53839 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tea_salaries

I know that the end of each file has weird extra lines with extra text. Let’s remove all of those lines that don’t have an actual region. Then we check that the files combined correctly by looking at the end of the datatset

tea_salaries <- tea_salaries |> filter(str_length(region) <= 2)

tea_salaries |> tail(50)

Another check that the files combined correctly.

tea_salaries |> group_by(district) |> 
  summarize(total = n())

Another check that the files combined correctly.

tea_salaries$staff_category |> unique()
[1] "TEACHING STAFF"         "SUPPORT STAFF"          "ADMINISTRATIVE STAFF"  
[4] NA                       "PARAPROFESSIONAL STAFF" "AUXILIARY"             
[7] "AUXILIARY STAFF"        "TRADITIONAL ISD/CSD"   

“TRADITIONAL ISD/CSD” shouldn’t be in there. Let’s see where that error is occurring.

tea_salaries |> filter(staff_category == "TRADITIONAL ISD/CSD")

It’s only one row…and it looks like it might be missing data. Let’s look at all the rows for this district.

tea_salaries |> filter(district == "176901")

These look normal…we only want staff_category that is “TEACHING STAFF” anyways so we can disregard that weird row since it has “ADMINISTRATIVE STAFF”.

Now we want to pull the school year from the file name (source column). Then we will get rid of that source column cause we don’t need it anymore and then I need ot factor the years so R understands that the strings have an order.

teaching_staff_salaries <- tea_salaries |> filter(staff_category == "TEACHING STAFF") |> 
  mutate(school_year = substr(source, start = (nchar(source) - 13 + 1), stop = nchar(source)-4))

teaching_staff_salaries <- teaching_staff_salaries |> select(-source) |> 
  mutate(
    school_year = factor(school_year, 
                         levels = c("2014-2015", "2015-2016", "2016-2017", "2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024")))

colnames(teaching_staff_salaries)[colnames(teaching_staff_salaries) == "district"] <- "district_number"

teaching_staff_salaries

District Types and Join

Let’s read in the district types spreadsheet. We also want to clean the names and then change the first column to be district_name so it doesn’t conflict when we join it later.

district_types <- read_xlsx("data-raw/district-type2223.xlsx", sheet = "2223_Data") |> clean_names()

colnames(district_types)[colnames(district_types) == 'district'] <- 'district_name'

district_types

Now I am going to join the district type data frame with the district salary data.

salaries_district_types <- teaching_staff_salaries |>  left_join(district_types, by = "district_number") 

salaries_district_types 
salaries_district_types |> count(district_number, district_name.x, district_name.y) |> count(district_number, sort = T)
salaries_district_types |> filter(district_number == "227824") 
salaries_district_types <- salaries_district_types |> mutate(
  end_year = substr(school_year, start = 6, stop = 9)
)

salaries_district_types

Export

I want to export the above dataframe to use for analysis in my next notebook.

salaries_district_types |> write_rds("data-processed/teaching_staff_salaries.rds")